###############################################################################
#                  EXAMPLE: Repository Collection & Extraction
###############################################################################
#
# This script shows how to collect and extract code from GitHub
# repositories. It uses caddyserver/caddy as the example repository.
#
# The workflow has two main steps:
#   1. COLLECTION: Clone the repository (bare clone, no working tree)
#   2. EXTRACTION: Extract code insertions from each commit
#
###############################################################################

library(stringi)

# ============================================================================
# CONFIGURATION - Modify these paths as needed
# ============================================================================

# Working directory where repos will be cloned
WD <- tempdir() # Per-session temp directory (normally removed when R exits)
# Alternatively, set a persistent directory:
# WD <- "/path/to/your/repos"

# Output directory for extracted code
# NOTE(review): output_dir is assigned here but is not referenced by any of
# the steps visible in this script -- confirm whether it is still needed.
output_dir <- tempdir()
# Alternatively:
# output_dir <- "/path/to/your/output"

# Example repository to process, written as "owner/name" on GitHub
EXAMPLE_REPO <- "caddyserver/caddy"

# ============================================================================
# STEP 1: REPOSITORY COLLECTION (CLONING)
# ============================================================================
# This section shows how to clone a GitHub repository as a bare repository.
# A bare clone contains only the git history, not the working files,
# which is sufficient for extracting commit information.

message("\n========================================")
message("STEP 1: Repository Collection (Cloning)")
message("========================================\n")

# All paths are built from WD explicitly instead of calling setwd(), so the
# script never changes the working directory of the R session running it.
message(sprintf("Working directory: %s", WD))

# The bare clone is stored as "<name>.git", where <name> is the last
# component of the "owner/name" repository identifier.
repo_folder <- paste0(basename(EXAMPLE_REPO), ".git")
message(sprintf("Repository folder: %s", repo_folder))

# Absolute location of the bare clone; used both as the clone target and by
# the extraction step below.
repo_path <- file.path(WD, repo_folder)

# Check if already cloned
if (dir.exists(repo_path)) {
    message("Repository already cloned, skipping...")
} else {
    message(sprintf("Cloning %s...", EXAMPLE_REPO))

    # Construct the git clone command
    # --bare: Clone without working directory (saves space)
    # --no-checkout: Don't checkout HEAD (not needed for bare repos)
    # The URL and the target directory are shell-quoted so that paths with
    # spaces or shell metacharacters are passed to git as single arguments.
    clone_cmd <- sprintf(
        "git clone --bare --no-checkout %s %s",
        shQuote(sprintf("https://github.com/%s.git", EXAMPLE_REPO)),
        shQuote(repo_path)
    )

    message(sprintf("Running: %s", clone_cmd))

    # Execute the clone command; with intern = FALSE, system() returns the
    # exit status of the command (0 on success).
    result <- system(clone_cmd, intern = FALSE)

    if (result == 0) {
        message("Clone successful!")
    } else {
        stop("Clone failed!")
    }
}

# ============================================================================
# STEP 2: EXTRACTION - Get code insertions from commits
# ============================================================================
# This section extracts the lines of code added in each commit.

message("\n========================================")
message("STEP 2: Code Extraction from Commits")
message("========================================\n")

message(sprintf("Repository path: %s", repo_path))

# -------------------------
# 2.1: Get all commit hashes
# -------------------------
message("\n--- Getting commit hashes ---")

# List the commit hashes reachable from any ref of a git repository.
#
# Args:
#   repo_path:   Path to the repository, passed to `git -C`.
#   max_commits: Optional cap. When not NULL it must be a single non-negative
#                number, and at most that many hashes are returned, taken
#                from the start of the `git rev-list --all` output.
#
# Returns:
#   A character vector with one hash per element (possibly empty), or NULL
#   if the git command could not be launched at all.
get_commit_hashes <- function(repo_path, max_commits = NULL) {
    if (!is.null(max_commits)) {
        stopifnot(
            is.numeric(max_commits),
            length(max_commits) == 1L,
            !is.na(max_commits),
            max_commits >= 0
        )
    }

    # git rev-list --all lists all commits in the repository
    git_cmd <- sprintf("git -C %s rev-list --all", shQuote(repo_path))

    commits <- tryCatch(
        system(git_cmd, intern = TRUE, ignore.stderr = TRUE),
        error = function(e) {
            # Keep the best-effort NULL result, but do not hide the reason.
            warning(
                "Could not list commits for ", repo_path, ": ",
                conditionMessage(e),
                call. = FALSE
            )
            NULL
        }
    )

    if (!is.null(max_commits) && length(commits) > max_commits) {
        # seq_len() rather than 1:max_commits: 1:0 is c(1, 0) and would
        # return one commit when the caller asked for none.
        commits <- commits[seq_len(max_commits)]
    }

    commits
}

# For this demo, only the first MAX_COMMITS hashes returned by
# get_commit_hashes() are processed to keep it fast.
# Set MAX_COMMITS <- NULL to process all commits.
MAX_COMMITS <- 50L

all_commits <- get_commit_hashes(repo_path, max_commits = MAX_COMMITS)

# The limit is reported from MAX_COMMITS so the message cannot drift from the
# value actually passed above.
if (is.null(MAX_COMMITS)) {
    message(sprintf("Found %d commits", length(all_commits)))
} else {
    message(sprintf(
        "Found %d commits (limited to %d for demo)",
        length(all_commits),
        as.integer(MAX_COMMITS)
    ))
}

# Show first few commit hashes
message("\nFirst 5 commit hashes:")
print(head(all_commits, 5))

# -------------------------
# 2.2: Extract insertions from a single commit
# -------------------------
message("\n--- Extracting code insertions ---")

# Extract the lines added by a single commit.
#
# Args:
#   repo_path:   Path to the repository, passed to `git -C`.
#   commit_hash: Hash (or other revision name) of the commit to inspect.
#
# Returns:
#   A data frame with one row per added line and two character columns:
#     commit - the value of `commit_hash`
#     code   - the raw diff line, including its leading diff marker(s)
#   or NULL when git produced no output or the commit added no lines.
extract_insertions <- function(repo_path, commit_hash) {
    # git show displays the diff for a specific commit
    # --no-notes: excludes git notes
    # --format=: empty format to skip commit message header
    # --cc: shows combined diff for merge commits (includes conflict resolutions)
    git_cmd <- sprintf(
        "git -C %s show --cc --no-notes --format= %s",
        shQuote(repo_path),
        shQuote(commit_hash)
    )

    raw_diff <- tryCatch(
        system(git_cmd, intern = TRUE, ignore.stderr = TRUE),
        error = function(e) {
            # Keep the best-effort NULL result, but do not hide the reason.
            warning(
                "Could not read commit ", commit_hash, ": ",
                conditionMessage(e),
                call. = FALSE
            )
            NULL
        }
    )

    if (is.null(raw_diff) || length(raw_diff) == 0) {
        return(NULL)
    }

    # A "+" prefix alone cannot tell an added line from the "+++ b/file"
    # header: an added line whose content starts with "+" looks the same.
    # Instead, track where each line sits. Every line inside a hunk starts
    # with a marker column, so a line starting with "diff " opens a file
    # header and a line starting with "@@" opens a hunk. Only lines inside a
    # hunk can be insertions. useBytes = TRUE keeps the matching byte-wise so
    # diff content that is not valid in the current locale is still matched.
    is_file_start <- grepl("^diff ", raw_diff, useBytes = TRUE)
    hunk_mark_len <- attr(
        regexpr("^@@+ ", raw_diff, useBytes = TRUE),
        "match.length"
    )
    is_hunk_start <- hunk_mark_len > 0L
    is_marker <- is_file_start | is_hunk_start

    # Number of marker columns in front of each hunk line: 1 for a normal
    # diff ("@@"), one per parent for a combined diff ("@@@" -> 2, ...).
    # The match length counts the "@" run plus the trailing space, with one
    # more "@" than there are marker columns. File-header sections get 0.
    marker_width <- as.integer(ifelse(
        is_hunk_start[is_marker],
        hunk_mark_len[is_marker] - 2L,
        0L
    ))
    section <- cumsum(is_marker)
    line_width <- c(0L, marker_width)[section + 1L]

    is_added <- logical(length(raw_diff))
    for (width in unique(line_width[line_width > 0L])) {
        in_scope <- line_width == width & !is_marker
        # A line belongs to the result when any of its marker columns is "+"
        # (the columns before it can then only be " " or "+").
        pattern <- if (width == 1L) {
            "^\\+"
        } else {
            sprintf("^[ +]{0,%d}\\+", width - 1L)
        }
        is_added[in_scope] <- grepl(
            pattern,
            raw_diff[in_scope],
            useBytes = TRUE
        )
    }

    insertions <- raw_diff[is_added]

    if (length(insertions) == 0) {
        return(NULL)
    }

    data.frame(
        commit = commit_hash,
        code = insertions,
        stringsAsFactors = FALSE
    )
}

# -------------------------
# 2.3: Process all commits
# -------------------------
message("\nProcessing commits...")

# Process each commit and collect results (NULL for commits that yielded no
# insertions)
results_list <- lapply(seq_along(all_commits), function(i) {
    if (i %% 10 == 0) {
        message(sprintf("  Processing commit %d/%d...", i, length(all_commits)))
    }
    extract_insertions(repo_path, all_commits[[i]])
})

# Combine all results into one data frame. When no commit produced any
# insertion, fall back to an empty data frame with the same columns:
# do.call(rbind, ...) would otherwise give NULL, and the summary lines below
# would print nothing because nrow(NULL) is NULL.
results_list <- Filter(Negate(is.null), results_list)
all_insertions <- if (length(results_list) > 0) {
    do.call(rbind, results_list)
} else {
    data.frame(
        commit = character(0),
        code = character(0),
        stringsAsFactors = FALSE
    )
}

message("\nExtraction complete!")
message(sprintf("Total insertions extracted: %d", nrow(all_insertions)))
message(sprintf("Unique commits with code: %d", length(unique(all_insertions$commit))))

# -------------------------
# 2.4: Preview the results
# -------------------------
message("\n--- Preview of extracted data ---")

# Show structure
message("\nData structure:")
str(all_insertions)

# Show sample
message("\nSample of extracted code (first 10 rows):")
print(head(all_insertions, 10))

# Show unique commits
message("\nUnique commits:")
print(head(unique(all_insertions$commit), 5))

